load libs

library(splines)
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(DBI)

Load exploration

# please uncoments the following code to download the database from github if 
# do not want to download the file from github manually nor run the download or explore
# notebook

# fileURL <- "https://github.com/ccb-hms/Imputation/blob/main/nhanes.sqlite"
# if(!file.exists("nhanes.sqlite")){
#     res <- tryCatch(download.file(fileURL,
#                               destfile="./nhanes.sqlite",
#                               method="auto"),
#                 error=function(e) 1)
# }


nhanes_db <- dbConnect(RSQLite::SQLite(), "nhanes.sqlite")

# list all of the tables
dbListTables(nhanes_db)
## [1] "blood_cholesterol"     "body_measures"         "current_health_status"
## [4] "demo"                  "diabetes"              "diet_total"           
## [7] "medical_conditions"    "merged_table"
cols <- 'BMXWAIST , RIDAGEYR, BMXHT, BMXWT, RIAGENDR, years, DR1TM161, WTDRD1, BMXLEG, BMXARML '
data_sql <- paste0('SELECT ', cols, 'FROM merged_table')

dbListTables(nhanes_db)
## [1] "blood_cholesterol"     "body_measures"         "current_health_status"
## [4] "demo"                  "diabetes"              "diet_total"           
## [7] "medical_conditions"    "merged_table"
data <- dbGetQuery(nhanes_db, data_sql)
data <- na.omit(data)

dbDisconnect(nhanes_db)


train_ix <- sample(x = 1:nrow(data), size = 2500)
test_ix <- sample(x = setdiff(1:nrow(data), train_ix), 1200)

train_data <- data[train_ix, ]
test_data <- data[test_ix, ]

Inverse Normal Distribution

invNorm <- function(x) {qnorm((rank(x) - 3/8)/(length(x) +1 - 6/8))}

mean_square_error <- function(y_true, y_pred){
    round(mean((y_true - y_pred)^2),4)
}

plot_density <- function(data,data_name,col='red'){
    d <- density(data)
    plot(d,main=paste(data_name,"Density"))
    polygon(d, col=col, border="blue")
  }

Load exploration

qplot(x=BMXHT,y=BMXWAIST,data=data,colour=RIAGENDR,alpha=I(0.1))

qplot(x=RIDAGEYR,y=BMXWAIST,data=data,colour=RIAGENDR,alpha=I(0.1))

qplot(x=BMXWT,y=BMXWAIST,data=data,colour=RIAGENDR,alpha=I(0.1))

regression models

test_df <- test_data |> select(-BMXWAIST)



run_model <- function(formula_str,train_data_set=train_data,
                      test_data_set=test_df){
  # setup regression model
  lm_reg = lm(formula = as.formula(formula_str), train_data_set)
  print(summary(lm_reg))
  
  # run prediction
  lm_pred = predict(lm_reg, newdata = test_df, se = T)
  
  # save prediction results
  pred_df = data.frame(
    fit = lm_pred$fit,
    weight = test_data$BMXWT,
    sex = test_data$RIAGENDR,
    label = test_data$BMXWAIST
  )
  # compute MSE
  mse<- mean_square_error(pred_df$fit, pred_df$label)
  
  #plot results
  g <-  ggplot(pred_df, aes(x = weight, y = label)) + geom_point(colour = "black",alpha = 0.1) +
    geom_point(aes(x = weight, y = fit, colour = sex,alpha = 0.1),
              size = 1.5) + ylab("waist circumference")
  
  g+ggtitle(paste("MSE = ",mse))
}


run_model("BMXWAIST ~ BMXWT")
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.7268  -4.9769  -0.2348   4.9041  29.2490 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 43.75908    0.59288   73.81   <2e-16 ***
## BMXWT        0.68446    0.00704   97.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.253 on 2498 degrees of freedom
## Multiple R-squared:  0.7909, Adjusted R-squared:  0.7909 
## F-statistic:  9451 on 1 and 2498 DF,  p-value: < 2.2e-16

run_model("BMXWAIST ~ bs(BMXWT)")
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -26.1660  -4.7950  -0.2237   4.8580  28.9910 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   66.589      1.020   65.27   <2e-16 ***
## bs(BMXWT)1    38.225      3.036   12.59   <2e-16 ***
## bs(BMXWT)2    72.230      2.532   28.52   <2e-16 ***
## bs(BMXWT)3    90.265      4.069   22.18   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.208 on 2496 degrees of freedom
## Multiple R-squared:  0.7937, Adjusted R-squared:  0.7934 
## F-statistic:  3201 on 3 and 2496 DF,  p-value: < 2.2e-16
## Warning in bs(BMXWT, degree = 3L, knots = numeric(0), Boundary.knots = c(37, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

run_model("BMXWAIST ~ bs(RIDAGEYR, df = 7) + BMXWT + RIAGENDR + years")
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.6713  -4.1404  -0.1093   4.1705  26.5294 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           37.193372   0.872496  42.629  < 2e-16 ***
## bs(RIDAGEYR, df = 7)1  1.337644   1.317056   1.016  0.30990    
## bs(RIDAGEYR, df = 7)2  1.362764   0.942823   1.445  0.14847    
## bs(RIDAGEYR, df = 7)3  3.322829   1.015300   3.273  0.00108 ** 
## bs(RIDAGEYR, df = 7)4  5.692912   0.887975   6.411 1.72e-10 ***
## bs(RIDAGEYR, df = 7)5  9.271967   1.138759   8.142 6.08e-16 ***
## bs(RIDAGEYR, df = 7)6 10.340631   1.206769   8.569  < 2e-16 ***
## bs(RIDAGEYR, df = 7)7 12.510724   1.239543  10.093  < 2e-16 ***
## BMXWT                  0.726116   0.006218 116.774  < 2e-16 ***
## RIAGENDRMale          -4.714737   0.252704 -18.657  < 2e-16 ***
## years2005-2006        -0.217119   0.523583  -0.415  0.67841    
## years2007-2008        -0.442658   0.493215  -0.897  0.36954    
## years2009-2010         0.332303   0.486776   0.683  0.49489    
## years2013-2014         1.029248   0.501634   2.052  0.04029 *  
## years2015-2016         1.491400   0.515688   2.892  0.00386 ** 
## years2017-2018         0.364910   0.505424   0.722  0.47037    
## years2022-2012        -0.048247   0.506252  -0.095  0.92408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.045 on 2483 degrees of freedom
## Multiple R-squared:  0.8556, Adjusted R-squared:  0.8547 
## F-statistic: 919.7 on 16 and 2483 DF,  p-value: < 2.2e-16

run_model("BMXWAIST ~ ns(RIDAGEYR, df = 7) + BMXWT + RIAGENDR + years")
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.9563  -4.1284  -0.1112   4.1327  26.5222 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           37.077231   0.793335  46.736  < 2e-16 ***
## ns(RIDAGEYR, df = 7)1  1.196274   0.774614   1.544  0.12263    
## ns(RIDAGEYR, df = 7)2  4.899847   0.982551   4.987 6.56e-07 ***
## ns(RIDAGEYR, df = 7)3  4.941640   0.839101   5.889 4.41e-09 ***
## ns(RIDAGEYR, df = 7)4  8.011813   0.862736   9.287  < 2e-16 ***
## ns(RIDAGEYR, df = 7)5  8.597619   0.831967  10.334  < 2e-16 ***
## ns(RIDAGEYR, df = 7)6 13.486953   1.435079   9.398  < 2e-16 ***
## ns(RIDAGEYR, df = 7)7 11.012311   0.824097  13.363  < 2e-16 ***
## BMXWT                  0.726016   0.006214 116.828  < 2e-16 ***
## RIAGENDRMale          -4.714870   0.252478 -18.674  < 2e-16 ***
## years2005-2006        -0.252181   0.522454  -0.483  0.62936    
## years2007-2008        -0.452088   0.489407  -0.924  0.35571    
## years2009-2010         0.309566   0.481554   0.643  0.52038    
## years2013-2014         1.013463   0.498535   2.033  0.04217 *  
## years2015-2016         1.480495   0.511400   2.895  0.00382 ** 
## years2017-2018         0.338513   0.501048   0.676  0.49935    
## years2022-2012        -0.063646   0.501525  -0.127  0.89903    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.041 on 2483 degrees of freedom
## Multiple R-squared:  0.8558, Adjusted R-squared:  0.8549 
## F-statistic: 921.1 on 16 and 2483 DF,  p-value: < 2.2e-16

run_model("BMXWAIST ~ bs(RIDAGEYR, df = 7) + bs(BMXWT) + RIAGENDR + BMXHT")
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.6329  -3.1637  -0.0372   3.3221  23.9405 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           132.18669    2.50117  52.850  < 2e-16 ***
## bs(RIDAGEYR, df = 7)1   1.66522    1.10475   1.507  0.13185    
## bs(RIDAGEYR, df = 7)2   1.40088    0.79283   1.767  0.07736 .  
## bs(RIDAGEYR, df = 7)3   2.71895    0.85240   3.190  0.00144 ** 
## bs(RIDAGEYR, df = 7)4   4.67823    0.74466   6.282 3.92e-10 ***
## bs(RIDAGEYR, df = 7)5   8.18036    0.95349   8.579  < 2e-16 ***
## bs(RIDAGEYR, df = 7)6   8.32671    1.00634   8.274  < 2e-16 ***
## bs(RIDAGEYR, df = 7)7  10.37791    1.02542  10.121  < 2e-16 ***
## bs(BMXWT)1             50.68186    2.18057  23.243  < 2e-16 ***
## bs(BMXWT)2             81.89221    1.80444  45.384  < 2e-16 ***
## bs(BMXWT)3            107.21009    2.90621  36.890  < 2e-16 ***
## RIAGENDRMale            0.53331    0.28350   1.881  0.06007 .  
## BMXHT                  -0.46426    0.01511 -30.731  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.081 on 2487 degrees of freedom
## Multiple R-squared:  0.8978, Adjusted R-squared:  0.8974 
## F-statistic:  1822 on 12 and 2487 DF,  p-value: < 2.2e-16
## Warning in bs(BMXWT, degree = 3L, knots = numeric(0), Boundary.knots = c(37, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

run_model("BMXWAIST ~ bs(RIDAGEYR, df = 7) + BMXWT + RIAGENDR + BMXHT + years")
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.3658  -3.3131  -0.0423   3.3434  23.6241 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           106.384953   2.493452  42.666  < 2e-16 ***
## bs(RIDAGEYR, df = 7)1   1.878036   1.137542   1.651 0.098873 .  
## bs(RIDAGEYR, df = 7)2   1.242629   0.814219   1.526 0.127097    
## bs(RIDAGEYR, df = 7)3   3.022987   0.876859   3.448 0.000575 ***
## bs(RIDAGEYR, df = 7)4   4.843558   0.767397   6.312 3.26e-10 ***
## bs(RIDAGEYR, df = 7)5   8.449445   0.983821   8.588  < 2e-16 ***
## bs(RIDAGEYR, df = 7)6   8.664630   1.043737   8.302  < 2e-16 ***
## bs(RIDAGEYR, df = 7)7  10.307882   1.073123   9.606  < 2e-16 ***
## BMXWT                   0.788480   0.005781 136.381  < 2e-16 ***
## RIAGENDRMale            0.863035   0.290412   2.972 0.002989 ** 
## BMXHT                  -0.453963   0.015595 -29.110  < 2e-16 ***
## years2005-2006         -0.035627   0.452201  -0.079 0.937209    
## years2007-2008         -0.705950   0.426029  -1.657 0.097636 .  
## years2009-2010         -0.142005   0.420688  -0.338 0.735729    
## years2013-2014          0.506923   0.433575   1.169 0.242448    
## years2015-2016          0.543725   0.446528   1.218 0.223464    
## years2017-2018         -0.387545   0.437242  -0.886 0.375519    
## years2022-2012         -0.269318   0.437257  -0.616 0.538001    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.221 on 2482 degrees of freedom
## Multiple R-squared:  0.8924, Adjusted R-squared:  0.8916 
## F-statistic:  1211 on 17 and 2482 DF,  p-value: < 2.2e-16

run_model("BMXWAIST ~ bs(RIDAGEYR, df = 7) + invNorm(BMXWT) + RIAGENDR + BMXHT + years")
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.7234  -3.5551  -0.1187   3.4965  22.3705 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           170.99494    2.74548  62.282  < 2e-16 ***
## bs(RIDAGEYR, df = 7)1   1.50848    1.18498   1.273  0.20314    
## bs(RIDAGEYR, df = 7)2   1.57826    0.84807   1.861  0.06286 .  
## bs(RIDAGEYR, df = 7)3   2.55800    0.91360   2.800  0.00515 ** 
## bs(RIDAGEYR, df = 7)4   4.53068    0.79954   5.667 1.63e-08 ***
## bs(RIDAGEYR, df = 7)5   7.99598    1.02499   7.801 8.97e-15 ***
## bs(RIDAGEYR, df = 7)6   8.25411    1.08726   7.592 4.44e-14 ***
## bs(RIDAGEYR, df = 7)7  10.19473    1.11783   9.120  < 2e-16 ***
## invNorm(BMXWT)         16.30449    0.12525 130.179  < 2e-16 ***
## RIAGENDRMale            0.02846    0.30226   0.094  0.92498    
## BMXHT                  -0.45234    0.01625 -27.832  < 2e-16 ***
## years2005-2006          0.15296    0.47111   0.325  0.74545    
## years2007-2008         -0.57111    0.44378  -1.287  0.19824    
## years2009-2010         -0.21686    0.43822  -0.495  0.62073    
## years2013-2014          0.76372    0.45162   1.691  0.09095 .  
## years2015-2016          0.68518    0.46511   1.473  0.14084    
## years2017-2018         -0.03682    0.45538  -0.081  0.93556    
## years2022-2012         -0.04154    0.45549  -0.091  0.92734    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.438 on 2482 degrees of freedom
## Multiple R-squared:  0.8832, Adjusted R-squared:  0.8824 
## F-statistic:  1104 on 17 and 2482 DF,  p-value: < 2.2e-16

run_model("BMXWAIST ~ bs(RIDAGEYR, df = 7) + BMXWT + RIAGENDR + invNorm(BMXHT) + years")
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.5232  -3.2377  -0.0766   3.4189  24.1107 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           30.744141   0.790494  38.892  < 2e-16 ***
## bs(RIDAGEYR, df = 7)1  1.849644   1.143505   1.618 0.105893    
## bs(RIDAGEYR, df = 7)2  1.177541   0.818511   1.439 0.150379    
## bs(RIDAGEYR, df = 7)3  3.045851   0.881457   3.455 0.000559 ***
## bs(RIDAGEYR, df = 7)4  4.792176   0.771517   6.211 6.14e-10 ***
## bs(RIDAGEYR, df = 7)5  8.480503   0.988970   8.575  < 2e-16 ***
## bs(RIDAGEYR, df = 7)6  8.650568   1.049297   8.244 2.66e-16 ***
## bs(RIDAGEYR, df = 7)7 10.183105   1.079167   9.436  < 2e-16 ***
## BMXWT                  0.786456   0.005798 135.638  < 2e-16 ***
## RIAGENDRMale           0.597438   0.287836   2.076 0.038032 *  
## invNorm(BMXHT)        -4.465009   0.156623 -28.508  < 2e-16 ***
## years2005-2006        -0.048951   0.454572  -0.108 0.914253    
## years2007-2008        -0.704234   0.428269  -1.644 0.100226    
## years2009-2010        -0.161861   0.422936  -0.383 0.701969    
## years2013-2014         0.519587   0.435846   1.192 0.233323    
## years2015-2016         0.542718   0.448914   1.209 0.226795    
## years2017-2018        -0.367735   0.439521  -0.837 0.402858    
## years2022-2012        -0.269516   0.439556  -0.613 0.539831    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.248 on 2482 degrees of freedom
## Multiple R-squared:  0.8912, Adjusted R-squared:  0.8905 
## F-statistic:  1196 on 17 and 2482 DF,  p-value: < 2.2e-16

run_model("BMXWAIST ~ bs(RIDAGEYR, df = 7) + invNorm(BMXWT) + RIAGENDR + invNorm(BMXHT) + years")
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.5689  -3.5363  -0.0743   3.4987  22.8113 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           95.43371    0.67691 140.983  < 2e-16 ***
## bs(RIDAGEYR, df = 7)1  1.48881    1.18369   1.258  0.20859    
## bs(RIDAGEYR, df = 7)2  1.50636    0.84717   1.778  0.07551 .  
## bs(RIDAGEYR, df = 7)3  2.57038    0.91261   2.817  0.00489 ** 
## bs(RIDAGEYR, df = 7)4  4.45665    0.79879   5.579 2.68e-08 ***
## bs(RIDAGEYR, df = 7)5  8.00675    1.02387   7.820 7.74e-15 ***
## bs(RIDAGEYR, df = 7)6  8.20666    1.08618   7.556 5.83e-14 ***
## bs(RIDAGEYR, df = 7)7 10.02741    1.11704   8.977  < 2e-16 ***
## invNorm(BMXWT)        16.30079    0.12500 130.403  < 2e-16 ***
## RIAGENDRMale          -0.13253    0.29778  -0.445  0.65631    
## invNorm(BMXHT)        -4.54159    0.16244 -27.958  < 2e-16 ***
## years2005-2006         0.14484    0.47059   0.308  0.75827    
## years2007-2008        -0.57490    0.44330  -1.297  0.19480    
## years2009-2010        -0.24576    0.43778  -0.561  0.57460    
## years2013-2014         0.76576    0.45113   1.697  0.08974 .  
## years2015-2016         0.66445    0.46465   1.430  0.15284    
## years2017-2018        -0.03294    0.45488  -0.072  0.94228    
## years2022-2012        -0.04578    0.45500  -0.101  0.91987    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.432 on 2482 degrees of freedom
## Multiple R-squared:  0.8835, Adjusted R-squared:  0.8827 
## F-statistic:  1107 on 17 and 2482 DF,  p-value: < 2.2e-16

run_model("BMXWAIST ~ bs(RIDAGEYR, knots = seq(-20,20, by = 10), Boundary.knots = c(-30,100))+ BMXWT + RIAGENDR + years")
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.5033  -4.1369  -0.0908   4.2013  26.3945 
## 
## Coefficients: (5 not defined because of singularities)
##                                                                              Estimate
## (Intercept)                                                                 51.737411
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))1         NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))2         NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))3         NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))4         NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))5 -14.506591
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))6 -13.851904
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))7  -3.432224
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))8         NA
## BMXWT                                                                        0.726199
## RIAGENDRMale                                                                -4.712019
## years2005-2006                                                              -0.226083
## years2007-2008                                                              -0.491270
## years2009-2010                                                               0.294104
## years2013-2014                                                               0.988924
## years2015-2016                                                               1.439226
## years2017-2018                                                               0.328845
## years2022-2012                                                              -0.095229
##                                                                            Std. Error
## (Intercept)                                                                  2.307985
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))1         NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))2         NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))3         NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))4         NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))5   3.393350
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))6   1.404916
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))7   4.484600
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))8         NA
## BMXWT                                                                        0.006213
## RIAGENDRMale                                                                 0.252431
## years2005-2006                                                               0.521963
## years2007-2008                                                               0.488132
## years2009-2010                                                               0.481255
## years2013-2014                                                               0.498128
## years2015-2016                                                               0.510024
## years2017-2018                                                               0.500337
## years2022-2012                                                               0.500430
##                                                                            t value
## (Intercept)                                                                 22.417
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))1      NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))2      NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))3      NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))4      NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))5  -4.275
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))6  -9.860
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))7  -0.765
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))8      NA
## BMXWT                                                                      116.877
## RIAGENDRMale                                                               -18.667
## years2005-2006                                                              -0.433
## years2007-2008                                                              -1.006
## years2009-2010                                                               0.611
## years2013-2014                                                               1.985
## years2015-2016                                                               2.822
## years2017-2018                                                               0.657
## years2022-2012                                                              -0.190
##                                                                            Pr(>|t|)
## (Intercept)                                                                 < 2e-16
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))1       NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))2       NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))3       NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))4       NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))5 1.98e-05
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))6  < 2e-16
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))7  0.44414
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))8       NA
## BMXWT                                                                       < 2e-16
## RIAGENDRMale                                                                < 2e-16
## years2005-2006                                                              0.66495
## years2007-2008                                                              0.31431
## years2009-2010                                                              0.54118
## years2013-2014                                                              0.04722
## years2015-2016                                                              0.00481
## years2017-2018                                                              0.51108
## years2022-2012                                                              0.84909
##                                                                               
## (Intercept)                                                                ***
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))1    
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))2    
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))3    
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))4    
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))5 ***
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))6 ***
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))7    
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))8    
## BMXWT                                                                      ***
## RIAGENDRMale                                                               ***
## years2005-2006                                                                
## years2007-2008                                                                
## years2009-2010                                                                
## years2013-2014                                                             *  
## years2015-2016                                                             ** 
## years2017-2018                                                                
## years2022-2012                                                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.042 on 2487 degrees of freedom
## Multiple R-squared:  0.8555, Adjusted R-squared:  0.8549 
## F-statistic:  1228 on 12 and 2487 DF,  p-value: < 2.2e-16
## Warning in predict.lm(lm_reg, newdata = test_df, se = T): prediction from a
## rank-deficient fit may be misleading

# grid.arrange(g1, g2,g3, nrow=3)

regression models with variables

base_form <- "BMXWAIST ~ bs(RIDAGEYR, df = 7) + BMXWT + RIAGENDR + years + "

# MFA 16:1 (Hexadecenoic) (gm)
run_model(paste0(base_form,"DR1TM161"))
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.1828  -4.0777  -0.1006   4.1779  26.8517 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           37.424802   0.877608  42.644  < 2e-16 ***
## bs(RIDAGEYR, df = 7)1  1.341987   1.315937   1.020 0.307925    
## bs(RIDAGEYR, df = 7)2  1.437858   0.942593   1.525 0.127280    
## bs(RIDAGEYR, df = 7)3  3.371231   1.014657   3.323 0.000905 ***
## bs(RIDAGEYR, df = 7)4  5.637960   0.887545   6.352 2.51e-10 ***
## bs(RIDAGEYR, df = 7)5  9.188624   1.138373   8.072 1.07e-15 ***
## bs(RIDAGEYR, df = 7)6 10.281208   1.206022   8.525  < 2e-16 ***
## bs(RIDAGEYR, df = 7)7 12.360987   1.240218   9.967  < 2e-16 ***
## BMXWT                  0.728377   0.006291 115.780  < 2e-16 ***
## RIAGENDRMale          -4.594644   0.257893 -17.816  < 2e-16 ***
## years2005-2006        -0.233204   0.523185  -0.446 0.655824    
## years2007-2008        -0.502333   0.493486  -1.018 0.308812    
## years2009-2010         0.269400   0.487139   0.553 0.580297    
## years2013-2014         0.933972   0.502936   1.857 0.063424 .  
## years2015-2016         1.431335   0.515918   2.774 0.005573 ** 
## years2017-2018         0.284140   0.506228   0.561 0.574652    
## years2022-2012        -0.144546   0.507571  -0.285 0.775837    
## DR1TM161              -0.338338   0.147945  -2.287 0.022285 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.04 on 2482 degrees of freedom
## Multiple R-squared:  0.8559, Adjusted R-squared:  0.8549 
## F-statistic: 867.4 on 17 and 2482 DF,  p-value: < 2.2e-16

hist(data$DR1TM161)

hist(invNorm(data$DR1TM161))

run_model(paste0(base_form,"invNorm(DR1TM161)"))
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.5429  -4.0934  -0.1196   4.1601  26.9842 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           36.962548   0.874557  42.264  < 2e-16 ***
## bs(RIDAGEYR, df = 7)1  1.413289   1.315219   1.075 0.282674    
## bs(RIDAGEYR, df = 7)2  1.411971   0.941478   1.500 0.133809    
## bs(RIDAGEYR, df = 7)3  3.436053   1.014408   3.387 0.000717 ***
## bs(RIDAGEYR, df = 7)4  5.617117   0.886936   6.333 2.84e-10 ***
## bs(RIDAGEYR, df = 7)5  9.215022   1.137119   8.104 8.27e-16 ***
## bs(RIDAGEYR, df = 7)6 10.270878   1.205089   8.523  < 2e-16 ***
## bs(RIDAGEYR, df = 7)7 12.344797   1.238837   9.965  < 2e-16 ***
## BMXWT                  0.728918   0.006279 116.085  < 2e-16 ***
## RIAGENDRMale          -4.563409   0.257370 -17.731  < 2e-16 ***
## years2005-2006        -0.223296   0.522759  -0.427 0.669308    
## years2007-2008        -0.503376   0.492857  -1.021 0.307191    
## years2009-2010         0.259190   0.486626   0.533 0.594340    
## years2013-2014         0.917966   0.502233   1.828 0.067704 .  
## years2015-2016         1.435866   0.515210   2.787 0.005361 ** 
## years2017-2018         0.269099   0.505649   0.532 0.594645    
## years2022-2012        -0.164762   0.506963  -0.325 0.745209    
## invNorm(DR1TM161)     -0.383826   0.128875  -2.978 0.002927 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.036 on 2482 degrees of freedom
## Multiple R-squared:  0.8561, Adjusted R-squared:  0.8552 
## F-statistic: 868.9 on 17 and 2482 DF,  p-value: < 2.2e-16

# Dietary day one sample weight
run_model(paste0(base_form,"WTDRD1"))
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.7615  -4.0387  -0.0509   4.1106  27.2154 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            3.805e+01  8.745e-01  43.515  < 2e-16 ***
## bs(RIDAGEYR, df = 7)1  1.365e+00  1.306e+00   1.045  0.29607    
## bs(RIDAGEYR, df = 7)2  1.300e+00  9.347e-01   1.391  0.16430    
## bs(RIDAGEYR, df = 7)3  3.252e+00  1.007e+00   3.230  0.00125 ** 
## bs(RIDAGEYR, df = 7)4  5.849e+00  8.806e-01   6.642 3.79e-11 ***
## bs(RIDAGEYR, df = 7)5  8.792e+00  1.131e+00   7.772 1.12e-14 ***
## bs(RIDAGEYR, df = 7)6  1.035e+01  1.196e+00   8.649  < 2e-16 ***
## bs(RIDAGEYR, df = 7)7  1.188e+01  1.232e+00   9.639  < 2e-16 ***
## BMXWT                  7.262e-01  6.164e-03 117.808  < 2e-16 ***
## RIAGENDRMale          -4.736e+00  2.505e-01 -18.903  < 2e-16 ***
## years2005-2006        -1.084e-01  5.193e-01  -0.209  0.83468    
## years2007-2008        -5.364e-01  4.891e-01  -1.097  0.27289    
## years2009-2010         2.217e-01  4.828e-01   0.459  0.64611    
## years2013-2014         1.018e+00  4.973e-01   2.046  0.04082 *  
## years2015-2016         1.570e+00  5.114e-01   3.069  0.00217 ** 
## years2017-2018         4.611e-01  5.013e-01   0.920  0.35776    
## years2022-2012        -6.032e-03  5.019e-01  -0.012  0.99041    
## WTDRD1                -1.761e-05  2.638e-06  -6.675 3.03e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.993 on 2482 degrees of freedom
## Multiple R-squared:  0.8582, Adjusted R-squared:  0.8572 
## F-statistic: 883.4 on 17 and 2482 DF,  p-value: < 2.2e-16

# BMXARML - Upper Arm Length (cm)
run_model(paste0(base_form,"BMXARML"))
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.3393  -3.7212  -0.1236   3.7381  23.8669 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           68.372041   2.025362  33.758  < 2e-16 ***
## bs(RIDAGEYR, df = 7)1  1.447563   1.247802   1.160 0.246124    
## bs(RIDAGEYR, df = 7)2  1.242831   0.893264   1.391 0.164248    
## bs(RIDAGEYR, df = 7)3  3.447961   0.961929   3.584 0.000344 ***
## bs(RIDAGEYR, df = 7)4  5.707560   0.841272   6.784 1.45e-11 ***
## bs(RIDAGEYR, df = 7)5  9.658422   1.079108   8.950  < 2e-16 ***
## bs(RIDAGEYR, df = 7)6 10.779565   1.143595   9.426  < 2e-16 ***
## bs(RIDAGEYR, df = 7)7 13.378958   1.175477  11.382  < 2e-16 ***
## BMXWT                  0.793315   0.007112 111.540  < 2e-16 ***
## RIAGENDRMale          -2.248113   0.280564  -8.013 1.71e-15 ***
## years2005-2006        -0.048946   0.496145  -0.099 0.921422    
## years2007-2008        -0.504109   0.467288  -1.079 0.280783    
## years2009-2010        -0.448991   0.463496  -0.969 0.332786    
## years2013-2014         0.526270   0.476186   1.105 0.269189    
## years2015-2016         0.959409   0.489582   1.960 0.050149 .  
## years2017-2018        -0.160745   0.479855  -0.335 0.737663    
## years2022-2012        -0.517486   0.480432  -1.077 0.281528    
## BMXARML               -1.006012   0.059660 -16.862  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.727 on 2482 degrees of freedom
## Multiple R-squared:  0.8705, Adjusted R-squared:  0.8696 
## F-statistic: 981.1 on 17 and 2482 DF,  p-value: < 2.2e-16

#BMXLEG - Upper Leg Length (cm)
run_model(paste0(base_form,"BMXLEG"))
## 
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.8445  -3.6406  -0.1098   3.5715  26.8938 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           70.257407   1.515146  46.370  < 2e-16 ***
## bs(RIDAGEYR, df = 7)1  1.251076   1.173380   1.066 0.286430    
## bs(RIDAGEYR, df = 7)2  0.692536   0.840382   0.824 0.409977    
## bs(RIDAGEYR, df = 7)3  2.234994   0.905550   2.468 0.013650 *  
## bs(RIDAGEYR, df = 7)4  3.386435   0.796288   4.253 2.19e-05 ***
## bs(RIDAGEYR, df = 7)5  7.291548   1.017514   7.166 1.01e-12 ***
## bs(RIDAGEYR, df = 7)6  6.826024   1.083971   6.297 3.57e-10 ***
## bs(RIDAGEYR, df = 7)7  9.794273   1.109475   8.828  < 2e-16 ***
## BMXWT                  0.748006   0.005606 133.423  < 2e-16 ***
## RIAGENDRMale          -1.220653   0.263772  -4.628 3.89e-06 ***
## years2005-2006        -1.368641   0.468658  -2.920 0.003528 ** 
## years2007-2008        -2.100519   0.444221  -4.729 2.39e-06 ***
## years2009-2010        -1.589674   0.440212  -3.611 0.000311 ***
## years2013-2014        -0.307542   0.449992  -0.683 0.494393    
## years2015-2016        -0.225389   0.464366  -0.485 0.627458    
## years2017-2018        -0.739519   0.452377  -1.635 0.102230    
## years2022-2012        -1.611087   0.455193  -3.539 0.000408 ***
## BMXLEG                -0.868298   0.034154 -25.423  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.386 on 2482 degrees of freedom
## Multiple R-squared:  0.8855, Adjusted R-squared:  0.8847 
## F-statistic:  1129 on 17 and 2482 DF,  p-value: < 2.2e-16

# grid.arrange(g1, g2,g3, nrow=3)